Authors: Mauro Venticinque, Angelo Schillaci, Daniele Tambone
GitHub project: Bank-Marketing
Date: 2025-03-28
Here we will write some information about the project.
datatable(head(train, 100), options = list(scrollX = TRUE))
str(train)
## 'data.frame': 32950 obs. of 22 variables:
## $ X : int 35248 39854 14530 27822 40199 21227 16836 39099 38565 38152 ...
## $ age : int 30 39 43 27 56 41 57 46 61 35 ...
## $ job : chr "blue-collar" "technician" "services" "student" ...
## $ marital : chr "married" "married" "single" "single" ...
## $ education : chr "professional.course" "university.degree" "high.school" "high.school" ...
## $ default : chr "no" "no" "no" "no" ...
## $ housing : chr "no" "yes" "no" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "cellular" "cellular" "cellular" "cellular" ...
## $ month : chr "may" "jun" "jul" "mar" ...
## $ day_of_week : chr "fri" "mon" "tue" "thu" ...
## $ duration : int 1357 713 1317 80 230 697 1441 679 106 234 ...
## $ campaign : int 4 2 4 4 2 2 2 1 2 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 1 0 0 0 1 0 0 0 1 0 ...
## $ poutcome : chr "failure" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num -1.8 -1.7 1.4 -1.8 -1.7 1.4 1.4 -3 -3.4 -3.4 ...
## $ cons.price.idx: num 92.9 94.1 93.9 92.8 94.2 ...
## $ cons.conf.idx : num -46.2 -39.8 -42.7 -50 -40.3 -36.1 -42.7 -33 -26.9 -29.8 ...
## $ euribor3m : num 1.25 0.72 4.96 1.65 0.87 ...
## $ nr.employed : num 5099 4992 5228 5099 4992 ...
## $ subscribed : chr "yes" "yes" "yes" "yes" ...
attach(train)
X (Integer): id of customerage (Integer): age of the customerjob (Categorical): occupationmarital (Categorical): marital statuseducation (Categorical): education leveldefault (Binary): has credit in default?housing (Binary): has housing loan?loan (Binary): has personal loan?contact (Categorical): contact communication typemonth (Categorical): last contact month of yearday_of_week (Integer): last contact day of the
weekduration (Integer): last contact duration, in seconds
(numeric). Important note: this attribute highly affects the output
target (e.g., if duration=0 then y=‘no’). Yet, the duration is not known
before a call is performed. Also, after the end of the call y is
obviously known. Thus, this input should only be included for benchmark
purposes and should be discarded if the intention is to have a realistic
predictive modelcampaign (Integer): number of contacts performed during
this campaign and for this client (numeric, includes last contact)pdays (Integer): number of days that passed by after
the client was last contacted from a previous campaign (numeric; -1
means client was not previously contacted)previous (Integer): number of contacts performed before
this campaign and for this clientpoutcome (Categorical): outcome of the previous
marketing campaign (categorical: ‘failure’,‘nonexistent’,‘success’)subscribed (Binary): has the client subscribed a term
deposit?Source: UCI Machine Learning Repository
vis_dat(train)
skim(train)
| Name | train |
| Number of rows | 32950 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| job | 0 | 1 | 6 | 13 | 0 | 12 | 0 |
| marital | 0 | 1 | 6 | 8 | 0 | 4 | 0 |
| education | 0 | 1 | 7 | 19 | 0 | 8 | 0 |
| default | 0 | 1 | 2 | 7 | 0 | 3 | 0 |
| housing | 0 | 1 | 2 | 7 | 0 | 3 | 0 |
| loan | 0 | 1 | 2 | 7 | 0 | 3 | 0 |
| contact | 0 | 1 | 8 | 9 | 0 | 2 | 0 |
| month | 0 | 1 | 3 | 3 | 0 | 10 | 0 |
| day_of_week | 0 | 1 | 3 | 3 | 0 | 5 | 0 |
| poutcome | 0 | 1 | 7 | 11 | 0 | 3 | 0 |
| subscribed | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| X | 0 | 1 | 20622.42 | 11882.00 | 1.00 | 10346.50 | 20629.50 | 30883.75 | 41188.00 | ▇▇▇▇▇ |
| age | 0 | 1 | 40.04 | 10.45 | 17.00 | 32.00 | 38.00 | 47.00 | 98.00 | ▅▇▃▁▁ |
| duration | 0 | 1 | 258.66 | 260.83 | 0.00 | 102.00 | 180.00 | 318.00 | 4918.00 | ▇▁▁▁▁ |
| campaign | 0 | 1 | 2.57 | 2.77 | 1.00 | 1.00 | 2.00 | 3.00 | 43.00 | ▇▁▁▁▁ |
| pdays | 0 | 1 | 961.90 | 188.33 | 0.00 | 999.00 | 999.00 | 999.00 | 999.00 | ▁▁▁▁▇ |
| previous | 0 | 1 | 0.17 | 0.49 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | ▇▁▁▁▁ |
| emp.var.rate | 0 | 1 | 0.08 | 1.57 | -3.40 | -1.80 | 1.10 | 1.40 | 1.40 | ▁▃▁▁▇ |
| cons.price.idx | 0 | 1 | 93.57 | 0.58 | 92.20 | 93.08 | 93.75 | 93.99 | 94.77 | ▁▆▃▇▂ |
| cons.conf.idx | 0 | 1 | -40.49 | 4.63 | -50.80 | -42.70 | -41.80 | -36.40 | -26.90 | ▅▇▁▇▁ |
| euribor3m | 0 | 1 | 3.62 | 1.74 | 0.63 | 1.34 | 4.86 | 4.96 | 5.04 | ▅▁▁▁▇ |
| nr.employed | 0 | 1 | 5167.01 | 72.31 | 4963.60 | 5099.10 | 5191.00 | 5228.10 | 5228.10 | ▁▁▃▁▇ |
plot_ly(train, x = subscribed, type = 'histogram')
corrplot(cor(train[, c("X", "age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")]), method="pie")
plot_ly(train, x = job, y = age, type = 'box', color = job)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_ly(train, x = education, y = age, type = 'box', color = education)
ord_edu <- train %>% count(education) %>%arrange(n)%>% pull(education)
eduResp <- ggplot(train, aes(x = factor(education, levels = ord_edu), fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
ylab("Proportion") +
scale_fill_discrete(name = "Subscribed") +
xlab("Education") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
eduFreq <- ggplot(as.data.frame(table(education)/sum(table(education))*100), aes(x = reorder(education, Freq), y = Freq)) +
geom_bar(stat = "identity", color = "gray", fill = "steelblue", alpha=0.9) +
coord_flip() +
labs(title = "Education", x = "Education Level", y = "Count") +
theme_minimal()
eduFreq / eduResp
ordine_poutcome <- train %>% count(poutcome) %>% arrange(n) %>%
pull(poutcome)
poutcomeFreq <- ggplot(as.data.frame(table(train$poutcome) / length(train$poutcome) * 100),
aes(x = reorder(Var1, Freq), y = Freq, fill = Var1)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(
title = "Distribution of Poutcome",
x = "Outcome previous campaign",
y = "Percentage (%)"
) +
scale_fill_brewer(palette = "Set2") +
theme_minimal() +
theme(legend.position = "none")
poutcomeResp <- ggplot(train, aes(x = factor(poutcome, levels = ordine_poutcome), fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
ylab("Proportion") +
scale_fill_discrete(name = "Poutcome") +
xlab("Outcome previous campaign") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal()
(poutcomeFreq / poutcomeResp) +
plot_layout(axis_titles = 'collect')
ggplot(train, aes(age)) + geom_histogram(binwidth=4,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
ordine_job <- train %>% count(job) %>%arrange(n)%>% pull(job)
jobFreq <- ggplot(as.data.frame(table(train$job) / length(train$job) * 100),
aes(x = reorder(Var1, Freq), y = Freq, fill = Var1)) +
geom_bar(stat = "identity", color = "gray", fill = "steelblue", alpha=0.9) +
coord_flip() +
labs(
title = "Distribution of job",
x = "Outcome of the previous campaign",
y = "Percentage (%)"
) +
theme_minimal() +
theme(legend.position = "none")
jobResp <- ggplot(train, aes(x = factor(job, levels = ordine_job), fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
labs(
title = "Proportion by subscribed",
x = "Default",
y = "Proportion"
) +
scale_fill_discrete(name = "Subscribed") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal()
jobFreq / jobResp
ggplot(train, aes(cons.price.idx)) + geom_histogram(binwidth=2,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
ggplot(train, aes(cons.conf.idx)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
ggplot(train, aes(euribor3m)) + geom_histogram(binwidth=3,position="fill",aes(fill=factor(subscribed)))+scale_fill_discrete(name="Subscribed")+ylab("proportion")+geom_hline(yintercept=0.5)
train$day_of_week <- factor(train$day_of_week,
levels = c("mon", "tue", "wed", "thu", "fri"),
ordered = TRUE)
dayFreq <- ggplot(as.data.frame(table(train$day_of_week)/length(train$day_of_week)*100), aes(x = Var1, y = Freq)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Distribution of Day of Week",
x = "Last Contact Day of Week",
y = "Percentage (%)")+
theme_minimal()
dayResp <- ggplot(train, aes(x = day_of_week, fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
ylab("Proportion") +
scale_fill_discrete(name = "Subscribed") +
xlab("Last Contact Day of Week") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal()
(dayFreq / dayResp) +
plot_layout(axis_titles = 'collect')
ordine_month<-factor(train$month,
levels = c("mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"),
ordered = TRUE)
monthFreq <- ggplot(as.data.frame(table(ordine_month)/length(ordine_month)*100), aes(x = ordine_month, y = Freq)) +
geom_bar(stat = "identity",color='gray', fill = "steelblue") +
coord_flip() +
labs(title = "Distribution of month",
x = "Last contact month of year",
y = "Percentage (%)")+
theme_minimal()
monthResp <- ggplot(train, aes(x = ordine_month, fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
ylab("Proportion") +
scale_fill_discrete(name = "month") +
xlab("Last contact month of year") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal()
(monthFreq / monthResp) +
plot_layout(axis_titles = 'collect')
ordine_previous <- train %>% count(previous) %>% arrange(n) %>%
pull(previous)
prevFreq <- ggplot(as.data.frame(table(train$previous)/length(train$previous)*100), aes(x = reorder(Var1,Freq), y = Freq)) +
geom_bar(stat = "identity",color='gray', fill = "steelblue") +
coord_flip() +
labs(title = "Distribution of Previous",
x = "Number of calls previous campain",
y = "Percentage (%)")+
theme_minimal()
prevResp <- ggplot(train, aes(x = factor(previous, levels = ordine_previous), fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
ylab("Proportion") +
scale_fill_discrete(name = "Previous") +
xlab("Number of calls previous campain") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal()
(prevFreq / prevResp) +
plot_layout(axis_titles = 'collect')
train$emp_cat <- ifelse(train$emp.var.rate < 0, "Negative", "Positive or Zero")
ordine_emp <- train %>% count(emp_cat) %>% arrange(n) %>%
pull(emp_cat)
empFreq <- ggplot(as.data.frame(table(ordine_emp)/length(ordine_emp)*100), aes(x = ordine_emp, y = Freq)) +
geom_bar(stat = "identity",color='gray', fill = "steelblue") +
coord_flip() +
labs(title = "Distribution of Employment Variation (±)",
x = "Employment Variation (±)",
y = "Percentage (%)")+
theme_minimal()
empResp <- ggplot(train, aes(x = factor(emp_cat, levels = ordine_emp), fill = factor(subscribed))) +
geom_bar(position = "fill") +
coord_flip() +
ylab("Proportion") +
scale_fill_discrete(name = "emp_cat") +
xlab("Employment Variation (±)") +
geom_hline(yintercept = 0.5, linetype = "dashed", color = "red") +
theme_minimal()
(empFreq / empResp) +
plot_layout(axis_titles = 'collect')
ggpairs(train[, c("age", "duration", "campaign", "pdays", "previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed")], columns = 1:10,
lower = list(continuous = wrap("points", alpha = 0.5, color = "darkred", size=0.5)),
title='Scatterplot', axisLabels='none')
1.1.3 Social and economic context attributes
emp.var.rate(Integer): employment variation rate - quarterly indicatorcons.price.idx(Integer): consumer price index - monthly indicatorcons.conf.idx(Integer): consumer confidence index - monthly indicatoreuribor3m(Integer): euribor 3 month rate - daily indicatornr.employed(Integer): number of employees - quarterly indicator